# Unpack the knit-time `params` list into top-level variables.
output.var <- params$output.var                    # name of the response column
transform.abs <- FALSE                             # absolute-value transform disabled
log.pred <- params$log.pred                        # model the response on the log10 scale?
norm.pred <- FALSE                                 # orderNorm transform of the response disabled
eda <- params$eda                                  # run the exploratory-analysis chunks?
algo.forward.caret <- params$algo.forward.caret    # run forward selection via caret?
algo.backward.caret <- params$algo.backward.caret  # run backward selection via caret?
algo.stepwise.caret <- params$algo.stepwise.caret  # run stepwise selection via caret?
algo.LASSO.caret <- params$algo.LASSO.caret        # run LASSO via caret?
algo.LARS.caret <- params$algo.LARS.caret          # run LARS via caret?
message("Parameters used for training/prediction: ")
## Parameters used for training/prediction:
str(params)
## List of 8
## $ output.var : chr "y3"
## $ log.pred : logi FALSE
## $ eda : logi TRUE
## $ algo.forward.caret : logi FALSE
## $ algo.backward.caret: logi FALSE
## $ algo.stepwise.caret: logi FALSE
## $ algo.LASSO.caret : logi FALSE
## $ algo.LARS.caret : logi FALSE
# Setup Labels
# Name of the response column actually used for modeling: '<output.var>.log'
# when predicting on the log10 scale, otherwise the raw column name.
# BUG FIX: the original read `... else output.var.tr = output.var`, which is a
# syntax error in R (an `=` assignment cannot appear as the else branch inside
# another assignment); the else branch should simply yield the value.
output.var.tr = if (log.pred == TRUE) paste0(output.var,'.log') else output.var
# Load the high-precision feature matrix and the labels, then join on JobName.
feat = read.csv('../../Data/features_highprec.csv')
labels = read.csv('../../Data/labels.csv')
# Predictor names = every feature column except the JobName join key.
predictors = names(dplyr::select(feat,-JobName))
data.ori = inner_join(feat,labels,by='JobName')
#data.ori = inner_join(feat,select_at(labels,c('JobName',output.var)),by='JobName')
# Keep only rows with no missing values; set incomplete rows aside for reporting.
cc = complete.cases(data.ori)
data.notComplete = data.ori[! cc,]
data = data.ori[cc,] %>% select_at(c(predictors,output.var,'JobName'))
message('Original cases: ',nrow(data.ori))
## Original cases: 10000
message('Non-Complete cases: ',nrow(data.notComplete))
## Non-Complete cases: 3020
message('Complete cases: ',nrow(data))
## Complete cases: 6980
# Quick sanity check of the join key and the response column.
summary(dplyr::select_at(data,c('JobName',output.var)))
## JobName y3
## Job_00001: 1 Min. : 95.91
## Job_00002: 1 1st Qu.:118.29
## Job_00003: 1 Median :124.03
## Job_00004: 1 Mean :125.40
## Job_00007: 1 3rd Qu.:131.06
## Job_00008: 1 Max. :193.73
## (Other) :6974
The output variable y3 shows right skewness, so we will proceed with a log transformation
# Histogram + density of the raw response, to assess skewness.
df=gather(select_at(data,output.var))
ggplot(df, aes(x=value)) +
geom_histogram(aes(y=..density..),bins = 50,fill='light blue') +
geom_density()
#stat_function(fun = dnorm, n = 100, args = list(mean = mean(df$value), sd = sd(df$value)))
# Normal Q-Q plot of the raw response.
ggplot(gather(select_at(data,output.var)), aes(sample=value)) +
stat_qq() +
facet_wrap(~key, scales = 'free',ncol=4)
# Build the modeling response column: log10-transformed when log.pred is set,
# otherwise a plain copy under the transformed name.
if(log.pred==TRUE) data[[output.var.tr]] = log(data[[output.var]],10) else
data[[output.var.tr]] = data[[output.var]]
# Compare raw vs. transformed response distributions side by side.
df=gather(select_at(data,c(output.var,output.var.tr)))
ggplot(df, aes(value)) +
geom_histogram(aes(y=..density..),bins = 50,fill='light blue') +
geom_density() +
# stat_function(fun = dnorm, n = 100, args = list(mean = mean(df$value), sd = sd(df$value)))
facet_wrap(~key, scales = 'free',ncol=2)
# Q-Q plots for raw and transformed response.
ggplot(gather(select_at(data,c(output.var,output.var.tr))), aes(sample=value)) +
stat_qq() +
facet_wrap(~key, scales = 'free',ncol=4)
Normalization of y3 using the bestNormalize package (which suggests orderNorm). This is interesting, but I think it goes beyond the objective of this project
# Fit bestNormalize's candidate transformations to the raw response and report
# their out-of-sample normality scores (lower Pearson P / df = more normal).
t=bestNormalize::bestNormalize(data[[output.var]])
t
## Best Normalizing transformation with 6980 Observations
## Estimated Normality Statistics (Pearson P / df, lower => more normal):
## - No transform: 2.9522
## - Box-Cox: 1.4738
## - Log_b(x+a): 2.0371
## - sqrt(x+a): 2.46
## - exp(x): 749.5901
## - arcsinh(x): 2.0373
## - Yeo-Johnson: 1.223
## - orderNorm: 1.1571
## Estimation method: Out-of-sample via CV with 10 folds and 5 repeats
##
## Based off these, bestNormalize chose:
## orderNorm Transformation with 6980 nonmissing obs and no ties
## - Original quantiles:
## 0% 25% 50% 75% 100%
## 95.913 118.289 124.030 131.059 193.726
# Q-Q plots: raw response vs. the chosen (orderNorm) transformed response.
qqnorm(data[[output.var]])
qqnorm(predict(t))
orderNorm() is a rank-based procedure by which the values of a vector are mapped to their percentile, which is then mapped to the same percentile of the normal distribution. Without the presence of ties, this essentially guarantees that the transformation leads to a normal distribution
# Engineered features, built in three families. Column creation order is
# preserved exactly, because the predictor list is later rebuilt from column
# order via grep on the column names.

# 1) Ratio features: first element of each pair divided by the second,
#    named '<a>by<b>'.
ratio.pairs = list(c('x2','x1'), c('x6','x5'), c('x9','x7'), c('x10','x8'),
                   c('x14','x12'), c('x15','x13'), c('x17','x16'),
                   c('x19','x18'), c('x21','x20'), c('x23','x22'))
for (pair in ratio.pairs) {
  data[[paste0(pair[1], 'by', pair[2])]] = data[[pair[1]]] / data[[pair[2]]]
}

# 2) Natural-log features, named '<v>log' (original creation order kept).
log.vars = c('x1','x2','x5','x6','x7','x9','x8','x10','x12','x14','x13','x15',
             'x16','x17','x18','x19','x20','x21','x22','x23','x11')
for (v in log.vars) {
  data[[paste0(v, 'log')]] = log(data[[v]])
}

# 3) Inverse-square features 1/v^2, named '<v>sqinv'.
sqinv.vars = c('x1','x5','x7','x8','x12','x13','x16','x18','x20','x22')
for (v in sqinv.vars) {
  data[[paste0(v, 'sqinv')]] = 1/(data[[v]])^2
}
# Echo the (pre-engineering) predictor list.
predictors
## [1] "x1" "x2" "x3" "x4" "x5" "x6" "x7" "x8" "x9" "x10" "x11"
## [12] "x12" "x13" "x14" "x15" "x16" "x17" "x18" "x19" "x20" "x21" "x22"
## [23] "x23" "stat1" "stat2" "stat3" "stat4" "stat5" "stat6" "stat7" "stat8" "stat9" "stat10"
## [34] "stat11" "stat12" "stat13" "stat14" "stat15" "stat16" "stat17" "stat18" "stat19" "stat20" "stat21"
## [45] "stat22" "stat23" "stat24" "stat25" "stat26" "stat27" "stat28" "stat29" "stat30" "stat31" "stat32"
## [56] "stat33" "stat34" "stat35" "stat36" "stat37" "stat38" "stat39" "stat40" "stat41" "stat42" "stat43"
## [67] "stat44" "stat45" "stat46" "stat47" "stat48" "stat49" "stat50" "stat51" "stat52" "stat53" "stat54"
## [78] "stat55" "stat56" "stat57" "stat58" "stat59" "stat60" "stat61" "stat62" "stat63" "stat64" "stat65"
## [89] "stat66" "stat67" "stat68" "stat69" "stat70" "stat71" "stat72" "stat73" "stat74" "stat75" "stat76"
## [100] "stat77" "stat78" "stat79" "stat80" "stat81" "stat82" "stat83" "stat84" "stat85" "stat86" "stat87"
## [111] "stat88" "stat89" "stat90" "stat91" "stat92" "stat93" "stat94" "stat95" "stat96" "stat97" "stat98"
## [122] "stat99" "stat100" "stat101" "stat102" "stat103" "stat104" "stat105" "stat106" "stat107" "stat108" "stat109"
## [133] "stat110" "stat111" "stat112" "stat113" "stat114" "stat115" "stat116" "stat117" "stat118" "stat119" "stat120"
## [144] "stat121" "stat122" "stat123" "stat124" "stat125" "stat126" "stat127" "stat128" "stat129" "stat130" "stat131"
## [155] "stat132" "stat133" "stat134" "stat135" "stat136" "stat137" "stat138" "stat139" "stat140" "stat141" "stat142"
## [166] "stat143" "stat144" "stat145" "stat146" "stat147" "stat148" "stat149" "stat150" "stat151" "stat152" "stat153"
## [177] "stat154" "stat155" "stat156" "stat157" "stat158" "stat159" "stat160" "stat161" "stat162" "stat163" "stat164"
## [188] "stat165" "stat166" "stat167" "stat168" "stat169" "stat170" "stat171" "stat172" "stat173" "stat174" "stat175"
## [199] "stat176" "stat177" "stat178" "stat179" "stat180" "stat181" "stat182" "stat183" "stat184" "stat185" "stat186"
## [210] "stat187" "stat188" "stat189" "stat190" "stat191" "stat192" "stat193" "stat194" "stat195" "stat196" "stat197"
## [221] "stat198" "stat199" "stat200" "stat201" "stat202" "stat203" "stat204" "stat205" "stat206" "stat207" "stat208"
## [232] "stat209" "stat210" "stat211" "stat212" "stat213" "stat214" "stat215" "stat216" "stat217"
# Rebuild the predictor list from column order: controlled inputs (the x*
# columns, which now also include the engineered ratio/log/sqinv features)
# followed by the stat* columns.
controlled.vars = grep("^x", colnames(data), value = TRUE)
stat.vars = grep("^stat", colnames(data), value = TRUE)
predictors = c(controlled.vars, stat.vars)
predictors
## [1] "x1" "x2" "x3" "x4" "x5" "x6" "x7" "x8" "x9" "x10"
## [11] "x11" "x12" "x13" "x14" "x15" "x16" "x17" "x18" "x19" "x20"
## [21] "x21" "x22" "x23" "x2byx1" "x6byx5" "x9byx7" "x10byx8" "x14byx12" "x15byx13" "x17byx16"
## [31] "x19byx18" "x21byx20" "x23byx22" "x1log" "x2log" "x5log" "x6log" "x7log" "x9log" "x8log"
## [41] "x10log" "x12log" "x14log" "x13log" "x15log" "x16log" "x17log" "x18log" "x19log" "x20log"
## [51] "x21log" "x22log" "x23log" "x11log" "x1sqinv" "x5sqinv" "x7sqinv" "x8sqinv" "x12sqinv" "x13sqinv"
## [61] "x16sqinv" "x18sqinv" "x20sqinv" "x22sqinv" "stat1" "stat2" "stat3" "stat4" "stat5" "stat6"
## [71] "stat7" "stat8" "stat9" "stat10" "stat11" "stat12" "stat13" "stat14" "stat15" "stat16"
## [81] "stat17" "stat18" "stat19" "stat20" "stat21" "stat22" "stat23" "stat24" "stat25" "stat26"
## [91] "stat27" "stat28" "stat29" "stat30" "stat31" "stat32" "stat33" "stat34" "stat35" "stat36"
## [101] "stat37" "stat38" "stat39" "stat40" "stat41" "stat42" "stat43" "stat44" "stat45" "stat46"
## [111] "stat47" "stat48" "stat49" "stat50" "stat51" "stat52" "stat53" "stat54" "stat55" "stat56"
## [121] "stat57" "stat58" "stat59" "stat60" "stat61" "stat62" "stat63" "stat64" "stat65" "stat66"
## [131] "stat67" "stat68" "stat69" "stat70" "stat71" "stat72" "stat73" "stat74" "stat75" "stat76"
## [141] "stat77" "stat78" "stat79" "stat80" "stat81" "stat82" "stat83" "stat84" "stat85" "stat86"
## [151] "stat87" "stat88" "stat89" "stat90" "stat91" "stat92" "stat93" "stat94" "stat95" "stat96"
## [161] "stat97" "stat98" "stat99" "stat100" "stat101" "stat102" "stat103" "stat104" "stat105" "stat106"
## [171] "stat107" "stat108" "stat109" "stat110" "stat111" "stat112" "stat113" "stat114" "stat115" "stat116"
## [181] "stat117" "stat118" "stat119" "stat120" "stat121" "stat122" "stat123" "stat124" "stat125" "stat126"
## [191] "stat127" "stat128" "stat129" "stat130" "stat131" "stat132" "stat133" "stat134" "stat135" "stat136"
## [201] "stat137" "stat138" "stat139" "stat140" "stat141" "stat142" "stat143" "stat144" "stat145" "stat146"
## [211] "stat147" "stat148" "stat149" "stat150" "stat151" "stat152" "stat153" "stat154" "stat155" "stat156"
## [221] "stat157" "stat158" "stat159" "stat160" "stat161" "stat162" "stat163" "stat164" "stat165" "stat166"
## [231] "stat167" "stat168" "stat169" "stat170" "stat171" "stat172" "stat173" "stat174" "stat175" "stat176"
## [241] "stat177" "stat178" "stat179" "stat180" "stat181" "stat182" "stat183" "stat184" "stat185" "stat186"
## [251] "stat187" "stat188" "stat189" "stat190" "stat191" "stat192" "stat193" "stat194" "stat195" "stat196"
## [261] "stat197" "stat198" "stat199" "stat200" "stat201" "stat202" "stat203" "stat204" "stat205" "stat206"
## [271] "stat207" "stat208" "stat209" "stat210" "stat211" "stat212" "stat213" "stat214" "stat215" "stat216"
## [281] "stat217"
All predictors show a fat-tailed distribution, with tall density in the two tails and low density around the mean. The orderNorm transformation can help (see [Best Normalizator] section)
Histograms
# EDA: histograms + density for a hand-picked sample of predictors.
if (eda == TRUE){
cols = c('x11','x18','stat98','x7','stat110')
df=gather(select_at(data,cols))
ggplot(df, aes(value)) +
geom_histogram(aes(y=..density..),bins = 50,fill='light blue') +
geom_density() +
# stat_function(fun = dnorm, n = 100, args = list(mean = mean(df$value), sd = sd(df$value)))
facet_wrap(~key, scales = 'free',ncol=3)
# ggplot(gather(select_at(data,cols)), aes(sample=value)) +
# stat_qq()+
# facet_wrap(~key, scales = 'free',ncol=2)
# Numeric five-number summaries of the same columns.
lapply(select_at(data,cols),summary)
}
## $x11
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 9.000e-08 9.494e-08 1.001e-07 1.001e-07 1.052e-07 1.100e-07
##
## $x18
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.500 3.147 4.769 4.772 6.418 7.999
##
## $stat98
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -2.998619 -1.551882 -0.015993 -0.005946 1.528405 2.999499
##
## $x7
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.700 1.266 1.854 1.852 2.446 3.000
##
## $stat110
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -2.999543 -1.496865 -0.002193 -0.004129 1.504273 2.999563
Scatter plot vs. output variable **y3**
# EDA: scatter of each sampled predictor (cols, set in the chunk above) vs.
# the (transformed) response, with a smoothed trend line.
# NOTE(review): aes_string() is deprecated in recent ggplot2; consider
# aes(.data[["value"]], .data[[output.var.tr]]) when upgrading.
if (eda == TRUE){
d = gather(dplyr::select_at(data,c(cols,output.var.tr)),key=target,value=value,-!!output.var.tr)
ggplot(data=d, aes_string(x='value',y=output.var.tr)) +
geom_point(color='light green',alpha=0.5) +
geom_smooth() +
facet_wrap(~target, scales = 'free',ncol=3)
}
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
All of these indicators show strong evidence of fat tails
# EDA: histograms + density for ALL predictors (large facet grid).
if (eda == TRUE){
df=gather(select_at(data,predictors))
ggplot(df, aes(value)) +
geom_histogram(aes(y=..density..),bins = 50,fill='light blue') +
geom_density() +
# stat_function(fun = dnorm, n = 100, args = list(mean = mean(df$value), sd = sd(df$value)))
facet_wrap(~key, scales = 'free',ncol=4)
}
# EDA: correlation of every predictor with the (transformed) response,
# sorted; the top positive and top negative correlations are tabulated.
if (eda == TRUE){
#chart.Correlation(select(data,-JobName), pch=21)
# https://stackoverflow.com/questions/27034655/how-to-use-dplyrarrangedesc-when-using-a-string-as-column-name
t=as.data.frame(round(cor(dplyr::select(data,-one_of(output.var.tr,'JobName'))
,select_at(data,output.var.tr)),4)) %>%
#rownames_to_column(var='variable') %>% filter(variable != !!output.var) %>% arrange(-y3.log)
rownames_to_column(var='variable') %>% filter(variable != !!output.var) %>% arrange(-!!sym(output.var.tr))
#DT::datatable(t)
message("Top Positive")
#kable(head(arrange(t,desc(y3.log)),20))
kable(head(arrange(t,desc(!!sym(output.var.tr))),20))
message("Top Negative")
#kable(head(arrange(t,y3.log),20))
kable(head(arrange(t,!!sym(output.var.tr)),20))
}
## Top Positive
## Top Negative
| variable | y3 |
|---|---|
| x18sqinv | -0.3419 |
| x19byx18 | -0.2301 |
| x7sqinv | -0.1976 |
| stat110 | -0.1514 |
| x4 | -0.0590 |
| x16sqinv | -0.0515 |
| x9byx7 | -0.0388 |
| stat13 | -0.0345 |
| stat41 | -0.0339 |
| stat14 | -0.0314 |
| stat149 | -0.0314 |
| stat113 | -0.0266 |
| stat106 | -0.0237 |
| stat4 | -0.0235 |
| stat146 | -0.0228 |
| x8sqinv | -0.0222 |
| stat186 | -0.0220 |
| stat214 | -0.0215 |
| stat22 | -0.0206 |
| stat20 | -0.0204 |
# EDA: pairwise correlations among all columns except JobName
# (only a 10x10 preview is rendered).
if (eda == TRUE){
#chart.Correlation(select(data,-JobName), pch=21)
t=as.data.frame(round(cor(dplyr::select(data,-one_of('JobName'))),4))
#DT::datatable(t,options=list(scrollX=T))
message("Showing only 10 variables")
kable(t[1:10,1:10])
}
## Showing only 10 variables
| x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | |
|---|---|---|---|---|---|---|---|---|---|---|
| x1 | 1.0000 | 0.0034 | -0.0028 | 0.0085 | 0.0068 | 0.0159 | 0.0264 | -0.0012 | 0.0142 | 0.0013 |
| x2 | 0.0034 | 1.0000 | -0.0057 | 0.0004 | -0.0094 | -0.0101 | 0.0089 | 0.0078 | 0.0049 | -0.0214 |
| x3 | -0.0028 | -0.0057 | 1.0000 | 0.0029 | 0.0046 | 0.0006 | -0.0105 | -0.0002 | 0.0167 | -0.0137 |
| x4 | 0.0085 | 0.0004 | 0.0029 | 1.0000 | -0.0059 | 0.0104 | 0.0098 | 0.0053 | 0.0061 | -0.0023 |
| x5 | 0.0068 | -0.0094 | 0.0046 | -0.0059 | 1.0000 | 0.0016 | -0.0027 | 0.0081 | 0.0259 | -0.0081 |
| x6 | 0.0159 | -0.0101 | 0.0006 | 0.0104 | 0.0016 | 1.0000 | 0.0200 | -0.0157 | 0.0117 | -0.0072 |
| x7 | 0.0264 | 0.0089 | -0.0105 | 0.0098 | -0.0027 | 0.0200 | 1.0000 | -0.0018 | -0.0069 | -0.0221 |
| x8 | -0.0012 | 0.0078 | -0.0002 | 0.0053 | 0.0081 | -0.0157 | -0.0018 | 1.0000 | 0.0142 | -0.0004 |
| x9 | 0.0142 | 0.0049 | 0.0167 | 0.0061 | 0.0259 | 0.0117 | -0.0069 | 0.0142 | 1.0000 | 0.0149 |
| x10 | 0.0013 | -0.0214 | -0.0137 | -0.0023 | -0.0081 | -0.0072 | -0.0221 | -0.0004 | 0.0149 | 1.0000 |
Scatter plots with all predictors and the output variable (y3)
# EDA: scatter of every predictor vs. the (transformed) response with a
# smoothed trend, faceted per predictor.
if (eda == TRUE){
d = gather(dplyr::select_at(data,c(predictors,output.var.tr)),key=target,value=value,-!!output.var.tr)
ggplot(data=d, aes_string(x='value',y=output.var.tr)) +
geom_point(color='light blue',alpha=0.5) +
geom_smooth() +
facet_wrap(~target, scales = 'free',ncol=4)
}
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
Multicollinearity check among predictors (the VIF values below show the engineered x* features are, unsurprisingly, highly collinear with their source columns)
Showing Top predictor by VIF Value
# EDA: Variance Inflation Factors for all predictors, largest first.
# NOTE(review): the engineered x* features (logs, ratios, inverse squares)
# show very high VIFs against their source columns; the stat* features do not.
if (eda == TRUE){
vifDF = usdm::vif(select_at(data,predictors)) %>% arrange(desc(VIF))
head(vifDF,75)
}
## Variables VIF
## 1 x16log 3915.716860
## 2 x5log 2571.710124
## 3 x20log 1921.098478
## 4 x16 1807.294814
## 5 x11log 1618.358705
## 6 x11 1618.225883
## 7 x8log 1613.197171
## 8 x5 1190.943766
## 9 x20 893.890464
## 10 x8 757.534871
## 11 x7log 551.189402
## 12 x1log 539.349800
## 13 x16sqinv 449.199883
## 14 x12log 372.200085
## 15 x13log 370.774138
## 16 x18log 322.556449
## 17 x5sqinv 299.888559
## 18 x7 267.236113
## 19 x1 258.274754
## 20 x20sqinv 226.968106
## 21 x8sqinv 189.779057
## 22 x12 184.896514
## 23 x13 182.552246
## 24 x22log 181.457292
## 25 x18 160.827041
## 26 x22 90.665166
## 27 x7sqinv 66.330161
## 28 x1sqinv 65.455060
## 29 x21 58.377628
## 30 x2 48.179048
## 31 x21log 46.405634
## 32 x12sqinv 46.116792
## 33 x13sqinv 46.088204
## 34 x2log 42.837124
## 35 x18sqinv 41.132277
## 36 x23 38.632854
## 37 x23log 36.191587
## 38 x6 35.428341
## 39 x17 29.532544
## 40 x22sqinv 23.868557
## 41 x21byx20 23.691232
## 42 x6log 22.342021
## 43 x17byx16 22.217307
## 44 x9 20.658657
## 45 x6byx5 20.334103
## 46 x10 19.142705
## 47 x19 18.834824
## 48 x14 17.360147
## 49 x2byx1 16.729954
## 50 x10byx8 15.103191
## 51 x9log 14.706157
## 52 x19log 14.450721
## 53 x15 13.697774
## 54 x14log 12.621835
## 55 x17log 12.529859
## 56 x23byx22 11.991025
## 57 x9byx7 11.582264
## 58 x19byx18 10.376255
## 59 x14byx12 9.863242
## 60 x15byx13 9.385733
## 61 x10log 9.045019
## 62 x15log 8.921556
## 63 stat52 1.076410
## 64 stat141 1.075522
## 65 stat11 1.073729
## 66 stat196 1.070109
## 67 stat156 1.069273
## 68 stat175 1.069096
## 69 stat166 1.069076
## 70 stat131 1.068780
## 71 stat186 1.068770
## 72 stat142 1.068682
## 73 stat178 1.068511
## 74 stat49 1.068381
## 75 stat199 1.067921
# Add a sqrt(x18) feature and compare it with raw x18 against the response.
data.tr=data %>%
mutate(x18.sqrt = sqrt(x18))
cols=c('x18','x18.sqrt')
# ggplot(gather(select_at(data.tr,cols)), aes(value)) +
# geom_histogram(aes(y=..density..),bins = 50,fill='light blue') +
# geom_density() +
# facet_wrap(~key, scales = 'free',ncol=4)
d = gather(dplyr::select_at(data.tr,c(cols,output.var.tr)),key=target,value=value,-!!output.var.tr)
ggplot(data=d, aes_string(x='value',y=output.var.tr)) +
geom_point(color='light blue',alpha=0.5) +
geom_smooth() +
facet_wrap(~target, scales = 'free',ncol=4)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
# removing unwanted variables (only the JobName key is dropped here)
data.tr=data.tr %>%
#dplyr::select_at(names(data.tr)[! names(data.tr) %in% c('x18','y3','JobName')])
dplyr::select_at(names(data.tr)[! names(data.tr) %in% c('JobName')])
data=data.tr
label.names=output.var.tr
# 0 for no interaction,
# 1 for Full 2 way interaction and
# 2 for Selective 2 way interaction
# 3 for Selective 3 way interaction
InteractionMode = 2
# PCA inputs: every column except the response label(s).
pca.vars = names(data)
pca.vars = pca.vars[!pca.vars %in% label.names]
# http://sshaikh.org/2015/05/06/parallelize-machine-learning-in-r-with-multi-core-cpus/
# Spin up a PSOCK cluster on ~75% of the cores, leaving the rest for other tasks.
# BUG FIX: makeCluster() expects an integer worker count; detectCores()*0.75 is
# fractional, so round down (but keep at least one worker).
cl <- makeCluster(max(1L, floor(detectCores()*0.75)))
registerDoParallel(cl)
# Fit the PCA according to InteractionMode:
#   1   = full 2-way interactions among ALL variables
#   0   = no interactions (plain PCA on the raw columns)
#   2/3 = 2-way / 3-way interactions among the controlled x* variables only,
#         with the stat* variables entering additively.
if(InteractionMode == 1){
pca.formula =as.formula(paste0('~(',paste0(pca.vars, collapse ='+'),')^2'))
pca.model = prcomp(formula=pca.formula,data=data[,pca.vars],center=T,scale.=T,retx = T)
#saveRDS(pca.model,'pca.model.rds')
}
if (InteractionMode == 0){
pca.model = prcomp(x=data[,pca.vars],center=T,scale.=T,retx = T)
}
# BUG FIX: scalar condition, so use the short-circuit && instead of the
# elementwise & (&& also errors early on malformed non-scalar input).
if (InteractionMode >= 2 && InteractionMode <= 3){
controlled.vars = pca.vars[grep("^x",pca.vars)]
stat.vars = pca.vars[grep("^stat",pca.vars)]
if (InteractionMode >= 2){
interaction.form = paste0('~(',paste0(controlled.vars, collapse ='+'),')^2')
}
if (InteractionMode >= 3){
# Mode 3 replaces the 2-way form built above with 3-way interactions.
interaction.form = paste0('~(',paste0(controlled.vars, collapse ='+'),')^3')
}
no.interact.form = paste0(stat.vars, collapse ='+')
pca.formula = as.formula(paste(interaction.form, no.interact.form, sep = "+"))
pca.model = prcomp(formula=pca.formula,data=data[,pca.vars],center=T,scale.=T,retx = T)
}
stopCluster(cl)
registerDoSEQ() # register sequential engine in case you are not using this function anymore
# Select the leading principal components that together explain up to
# targetCumVar of the total variance.
targetCumVar = .9
pca.model$var = pca.model$sdev ^ 2 #eigenvalues
pca.model$pvar = pca.model$var / sum(pca.model$var) # proportion of variance per PC
pca.model$cumpvar = cumsum(pca.model$pvar ) # cumulative proportion
# NOTE(review): '<=' excludes the component that crosses the threshold, so the
# selected set explains slightly less than targetCumVar.
pca.model$pcaSel = pca.model$cumpvar<=targetCumVar
pca.model$pcaSelCount = sum(pca.model$pcaSel)
pca.model$pcaSelTotVar = sum(pca.model$pvar[pca.model$pcaSel])
message(pca.model$pcaSelCount, " PCAs justify ",percent(targetCumVar)," of the total Variance. (",percent(pca.model$pcaSelTotVar),")")
## 164 PCAs justify 90.0% of the total Variance. (90.0%)
# Scree / cumulative-variance plots for the fitted PCA.
plot(pca.model$var,xlab="Principal component", ylab="Proportion of variance explained", type='b')
plot(cumsum(pca.model$pvar ),xlab="Principal component", ylab="Cumulative Proportion of variance explained", ylim=c(0,1), type='b')
screeplot(pca.model,npcs = pca.model$pcaSelCount)
screeplot(pca.model,npcs = pca.model$pcaSelCount,type='lines')
#summary(pca.model)
#pca.model$rotation
# creating dataset: response column + the retained PC score columns
data.pca = dplyr::select(data,!!label.names) %>%
dplyr::bind_cols(dplyr::select(as.data.frame(pca.model$x)
,!!colnames(pca.model$rotation)[pca.model$pcaSel])
)
data.pca = data.pca[sample(nrow(data.pca)),] # randomly shuffle data
# 80/20 train/test split via sample.split (presumably caTools -- verify the
# library() calls at the top of the file).
split = sample.split(data.pca[,label.names], SplitRatio = 0.8)
data.train = subset(data.pca, split == TRUE)
data.test = subset(data.pca, split == FALSE)
# Diagnostic plots for a fitted linear model: base lm plots, studentized and
# standardized residual plots, residual histogram, leverage plot, and Cook's
# distance plot.
#
# Args:
#   model: a fitted lm-like model.
#   train: the data the model was fitted on (used for predictions and for the
#          Cook's distance thresholds, which scale with 1/n).
# Returns: the vector of Cook's distances for all observations.
plot.diagnostics <- function(model, train) {
  plot(model)
  residuals = resid(model) # Plotted above in plot(lm.out)
  r.standard = rstandard(model) # internally studentized (standardized) residuals
  r.student = rstudent(model)   # externally studentized residuals
  # Studentized residuals vs. fitted values.
  df = data.frame(x=predict(model,train),y=r.student)
  p=ggplot(data=df,aes(x=x,y=y)) +
    geom_point(color='blue',alpha=0.5,shape=20,size=2) +
    geom_hline(yintercept = 0,size=1)+
    ylab("Student Residuals") +
    xlab("Predicted Values")+
    ggtitle("Student Residual Plot")
  plot(p)
  # Standardized residuals vs. fitted values, with +/-2 reference bands.
  # BUG FIX: this plot uses r.standard, so label it as standardized residuals
  # (the original reused the "Student Residuals" labels, which was misleading).
  df = data.frame(x=predict(model,train),y=r.standard)
  p=ggplot(data=df,aes(x=x,y=y)) +
    geom_point(color='blue',alpha=0.5,shape=20,size=2) +
    geom_hline(yintercept = c(-2,0,2),size=1)+
    ylab("Standardized Residuals") +
    xlab("Predicted Values")+
    ggtitle("Standardized Residual Plot")
  plot(p)
  # Histogram of studentized residuals against the standard normal density.
  df=data.frame(r.student)
  p=ggplot(data=df,aes(r.student)) +
    geom_histogram(aes(y=..density..),bins = 50,fill='blue',alpha=0.6) +
    stat_function(fun = dnorm, n = 100, args = list(mean = 0, sd = 1)) +
    ylab("Density")+
    xlab("Studentized Residuals")+
    ggtitle("Distribution of Studentized Residuals")
  plot(p)
  # http://www.stat.columbia.edu/~martin/W2024/R7.pdf
  # Influential plots
  inf.meas = influence.measures(model)
  # print (summary(inf.meas)) # too much data
  # Leverage (hat values) per observation index.
  lev = hat(model.matrix(model))
  df=tibble::rownames_to_column(as.data.frame(lev),'id')
  p=ggplot(data=df,aes(x=as.numeric(id),y=lev)) +
    geom_point(color='blue',alpha=0.5,shape=20,size=2) +
    ylab('Leverage - check') +
    xlab('Index')
  plot(p)
  # Cook's distance, with a reference line at the common 4/n cutoff.
  # Only points above the stricter 15/n threshold are labelled, to keep the
  # labels readable.
  cd = cooks.distance(model)
  df=tibble::rownames_to_column(as.data.frame(cd),'id')
  p=ggplot(data=df,aes(x=as.numeric(id),y=cd)) +
    geom_point(color='blue',alpha=0.5,shape=20,size=2) +
    geom_text(data=filter(df,cd>15/nrow(train)),aes(label=id),check_overlap=T,size=3,vjust=-.5)+
    ylab('Cooks distances') +
    geom_hline(yintercept = c(4/nrow(train),0),size=1)+
    xlab('Index')
  plot(p)
  print (paste("Number of data points that have Cook's D > 4/n: ", length(cd[cd > 4/nrow(train)]), sep = ""))
  print (paste("Number of data points that have Cook's D > 1: ", length(cd[cd > 1]), sep = ""))
  return(cd)
}
# function to set up random seeds
# Based on http://jaehyeon-kim.github.io/2015/05/Setup-Random-Seeds-on-Caret-Package.html
#
# Builds the `seeds` list expected by caret::trainControl(): one integer vector
# of length (numbers + tunes) per resample, plus a final single seed for the
# last model fit. Returns NULL for resampling methods other than "cv" and
# "repeatedcv".
setCaretSeeds <- function(method = "cv", numbers = 1, repeats = 1, tunes = NULL, seed = 1701) {
  #B is the number of resamples and integer vector of M (numbers + tune length if any)
  B <- if (method == "cv") numbers
  else if(method == "repeatedcv") numbers * repeats
  else NULL
  # BUG FIX: the original tested is.null(length) -- i.e. the base function
  # `length`, which is never NULL -- so unsupported methods crashed inside
  # vector() instead of returning NULL. Test B instead.
  if(is.null(B)) {
    seeds <- NULL
  } else {
    set.seed(seed = seed)
    seeds <- vector(mode = "list", length = B)
    seeds <- lapply(seeds, function(x) sample.int(n = 1000000
                    , size = numbers + ifelse(is.null(tunes), 0, tunes)))
    # Final single seed for the last model fit on the full training set.
    seeds[[length(seeds) + 1]] <- sample.int(n = 1000000, size = 1)
  }
  # return seeds
  seeds
}
# Trains a linear-model-selection method through caret, prints/plots its
# results and returns the fitted objects.
#
# Args:
#   formula: model formula passed to caret::train.
#   data: training data.
#   method: caret method name ('leapForward', 'leapBackward', 'leapSeq',
#           'glmnet' or 'lars').
#   subopt: sub-option of the method ('LASSO' for glmnet); NULL otherwise.
#   feature.names: predictor names (sizes the leap nvmax grid).
#   train.control / tune.grid / pre.proc: optional caret settings; sensible
#           defaults are built when NULL.
# Returns: a list with the fitted model, the best-model id (leap only) and
#          the diagnostic ggplot objects.
train.caret.glmselect = function(formula, data, method
                                 ,subopt = NULL, feature.names
                                 , train.control = NULL, tune.grid = NULL, pre.proc = NULL){
  # Default resampling: 10-fold CV with fixed seeds for reproducibility.
  if(is.null(train.control)){
    train.control <- trainControl(method = "cv"
                                  ,number = 10
                                  ,seeds = setCaretSeeds(method = "cv"
                                                         , numbers = 10
                                                         , seed = 1701)
                                  ,search = "grid"
                                  ,verboseIter = TRUE
                                  ,allowParallel = TRUE
    )
  }
  # BUG FIX: scalar conditions need the short-circuit ||/&&, not |/&; and
  # `subopt == 'LASSO'` yields logical(0) when subopt is NULL, which crashes
  # if(); isTRUE() makes the comparison NULL-safe.
  is.leap = method == 'leapForward' || method == 'leapBackward' || method == 'leapSeq'
  is.lasso = method == 'glmnet' && isTRUE(subopt == 'LASSO')
  if(is.null(tune.grid)){
    if (is.leap){
      # Search every subset size up to the full predictor count.
      tune.grid = data.frame(nvmax = seq_along(feature.names))
    }
    if (is.lasso){
      # Will only show 1 Lambda value during training, but that is OK
      # https://stackoverflow.com/questions/47526544/why-need-to-tune-lambda-with-carettrain-method-glmnet-and-cv-glmnet
      # Another option for LASSO is this: https://github.com/topepo/caret/blob/master/RegressionTests/Code/lasso.R
      lambda = 10^seq(-2,0, length =100)
      alpha = c(1) # alpha = 1 is the pure-LASSO penalty
      tune.grid = expand.grid(alpha = alpha,lambda = lambda)
    }
    if (method == 'lars'){
      # https://github.com/topepo/caret/blob/master/RegressionTests/Code/lars.R
      fraction = seq(0, 1, length = 100)
      tune.grid = expand.grid(fraction = fraction)
      pre.proc = c("center", "scale")
    }
  }
  # http://sshaikh.org/2015/05/06/parallelize-machine-learning-in-r-with-multi-core-cpus/
  # BUG FIX: makeCluster() needs an integer worker count; round 75% of the
  # cores down (minimum one worker), leaving the rest for other tasks.
  cl <- makeCluster(max(1L, floor(detectCores()*0.75)))
  registerDoParallel(cl)
  set.seed(1)
  # note that the seed has to actually be set just before this function is called
  # setting it above does not ensure reproducibility for some reason
  model.caret <- caret::train(formula
                              , data = data
                              , method = method
                              , tuneGrid = tune.grid
                              , trControl = train.control
                              , preProc = pre.proc
  )
  stopCluster(cl)
  registerDoSEQ() # register sequential engine in case you are not using this function anymore
  if (is.leap){
    print("All models results")
    print(model.caret$results) # all model results
    print("Best Model")
    print(model.caret$bestTune) # best model
    model = model.caret$finalModel
    # Metrics Plot: MAE/RMSE/Rsquared as a function of subset size (nvmax).
    dataPlot = model.caret$results %>%
      gather(key='metric',value='value',-nvmax) %>%
      dplyr::filter(metric %in% c('MAE','RMSE','Rsquared'))
    metricsPlot = ggplot(data=dataPlot,aes(x=nvmax,y=value) ) +
      geom_line(color='lightblue4') +
      geom_point(color='blue',alpha=0.7,size=.9) +
      facet_wrap(~metric,ncol=2,scales='free_y')+
      theme_light()
    plot(metricsPlot)
    # Residuals Plot
    # leap function does not support studentized residuals
    dataPlot=data.frame(pred=predict(model.caret,data),res=resid(model.caret))
    residPlot = ggplot(dataPlot,aes(x=pred,y=res)) +
      geom_point(color='light blue',alpha=0.7) +
      geom_smooth(method="lm")+
      theme_light()
    plot(residPlot)
    # BUG FIX: theme_light() stood on its own line without a preceding '+',
    # so the theme was silently dropped from the histogram.
    residHistogram = ggplot(dataPlot,aes(x=res)) +
      geom_histogram(aes(y=..density..),fill='light blue',alpha=1) +
      #geom_density(color='lightblue4') +
      stat_function(fun = dnorm, n = 100, args = list(mean = mean(dataPlot$res)
                    , sd = sd(dataPlot$res)),color='lightblue4') +
      theme_light()
    plot(residHistogram)
    id = rownames(model.caret$bestTune)
    # Provides the coefficients of the best model.
    # regsubsets does not return a full model (see regsubsets documentation),
    # so we refit an lm on the selected terms to get coefficient intervals.
    # https://stackoverflow.com/questions/13063762/how-to-obtain-a-lm-object-from-regsubsets
    print("Coefficients of final model:")
    coefs <- coef(model, id=id)
    nams <- names(coefs)
    nams <- nams[!nams %in% "(Intercept)"]
    response <- as.character(formula[[2]])
    form <- as.formula(paste(response, paste(nams, collapse = " + "), sep = " ~ "))
    mod <- lm(form, data = data)
    print(car::Confint(mod))
    # metricsPlot added to the return list for consistency with other branches.
    return(list(model = model,id = id, residPlot = residPlot, residHistogram=residHistogram
                ,metricsPlot=metricsPlot, modelLM=mod))
  }
  if (is.lasso){
    print(model.caret)
    print(plot(model.caret))
    print(model.caret$bestTune)
    print(model.caret$results)
    model=model.caret$finalModel
    # Metrics Plot: MAE/RMSE/Rsquared as a function of lambda.
    dataPlot = model.caret$results %>%
      gather(key='metric',value='value',-lambda) %>%
      dplyr::filter(metric %in% c('MAE','RMSE','Rsquared'))
    metricsPlot = ggplot(data=dataPlot,aes(x=lambda,y=value) ) +
      geom_line(color='lightblue4') +
      geom_point(color='blue',alpha=0.7,size=.9) +
      facet_wrap(~metric,ncol=2,scales='free_y')+
      theme_light()
    plot(metricsPlot)
    # Residuals Plot
    dataPlot=data.frame(pred=predict(model.caret,data),res=resid(model.caret))
    residPlot = ggplot(dataPlot,aes(x=pred,y=res)) +
      geom_point(color='light blue',alpha=0.7) +
      geom_smooth(method="lm")+
      theme_light()
    plot(residPlot)
    # BUG FIX: missing '+' before theme_light() (see leap branch).
    residHistogram = ggplot(dataPlot,aes(x=res)) +
      geom_histogram(aes(y=..density..),fill='light blue',alpha=1) +
      #geom_density(color='lightblue4') +
      stat_function(fun = dnorm, n = 100, args = list(mean = mean(dataPlot$res)
                    , sd = sd(dataPlot$res)),color='lightblue4') +
      theme_light()
    plot(residHistogram)
    print("Coefficients")
    #no interval for glmnet: https://stackoverflow.com/questions/39750965/confidence-intervals-for-ridge-regression
    t=coef(model,s=model.caret$bestTune$lambda)
    model.coef = t[which(t[,1]!=0),]
    print(as.data.frame(model.coef))
    id = NULL # not really needed but added for consistency
    # residHistogram added to the return list for consistency with other branches.
    return(list(model = model.caret,id = id, residPlot = residPlot, metricsPlot=metricsPlot
                ,residHistogram=residHistogram ))
  }
  if (method == 'lars'){
    print(model.caret)
    print(plot(model.caret))
    print(model.caret$bestTune)
    # Metrics Plot: MAE/RMSE/Rsquared as a function of the L1 fraction.
    dataPlot = model.caret$results %>%
      gather(key='metric',value='value',-fraction) %>%
      dplyr::filter(metric %in% c('MAE','RMSE','Rsquared'))
    metricsPlot = ggplot(data=dataPlot,aes(x=fraction,y=value) ) +
      geom_line(color='lightblue4') +
      geom_point(color='blue',alpha=0.7,size=.9) +
      facet_wrap(~metric,ncol=2,scales='free_y')+
      theme_light()
    plot(metricsPlot)
    # Residuals Plot
    dataPlot=data.frame(pred=predict(model.caret,data),res=resid(model.caret))
    residPlot = ggplot(dataPlot,aes(x=pred,y=res)) +
      geom_point(color='light blue',alpha=0.7) +
      geom_smooth(method="lm")+
      theme_light()
    plot(residPlot)
    # BUG FIX: missing '+' before theme_light() (see leap branch).
    residHistogram = ggplot(dataPlot,aes(x=res)) +
      geom_histogram(aes(y=..density..),fill='light blue',alpha=1) +
      #geom_density(color='lightblue4') +
      stat_function(fun = dnorm, n = 100, args = list(mean = mean(dataPlot$res)
                    , sd = sd(dataPlot$res)),color='lightblue4') +
      theme_light()
    plot(residHistogram)
    print("Coefficients")
    t=coef(model.caret$finalModel,s=model.caret$bestTune$fraction,mode='fraction')
    model.coef = t[which(t!=0)]
    print(model.coef)
    id = NULL # not really needed but added for consistency
    # metricsPlot added to the return list for consistency with other branches.
    return(list(model = model.caret,id = id, residPlot = residPlot, residHistogram=residHistogram
                ,metricsPlot=metricsPlot))
  }
}
# https://stackoverflow.com/questions/48265743/linear-model-subset-selection-goodness-of-fit-with-k-fold-cross-validation
# Predict from a regsubsets fit. The formula is passed in explicitly because
# object$call[[2]] only yields the symbol `formula`, not its value.
# Builds the full design matrix from the formula (adding the intercept and
# expanding any interaction terms), keeps the columns of the selected model
# `id`, and multiplies by that model's coefficients.
predict.regsubsets <- function(object, newdata, id, formula, ...) {
  design <- model.matrix(formula, newdata)
  beta <- coef(object, id = id)
  design[, names(beta)] %*% beta
}
# Evaluates a fitted model on the test set: prints a prediction summary and
# the test MSE/RMSE (on the modeling scale), then returns a predicted-vs-
# actual scatter with +/-`good` (green) and +/-`ok` (red) tolerance bands on
# the original response scale.
#
# NOTE(review): log.pred and norm.pred are read from the global environment,
# not passed as arguments -- confirm they are set before calling.
test.model = function(model, test, level=0.95
                      ,draw.limits = FALSE, good = 0.1, ok = 0.15
                      ,method = NULL, subopt = NULL
                      ,id = NULL, formula, feature.names, label.names
                      ,transformation = NULL){
  ## if using caret for glm select equivalent functionality,
  ## need to pass formula (full is ok as it will select subset of variables from there)
  # BUG FIX: the original used independent if statements, so method = NULL
  # crashed on the later comparisons (if(logical(0))); use an else-if chain.
  # isTRUE() also makes the subopt comparison NULL-safe.
  if (is.null(method)){
    pred = predict(model, newdata=test, interval="confidence", level = level)
  } else if (method == 'leapForward' || method == 'leapBackward' || method == 'leapSeq'){
    pred = predict.regsubsets(model, newdata = test, id = id, formula = formula)
  } else if (method == 'glmnet' && isTRUE(subopt == 'LASSO')){
    xtest = as.matrix(test[,feature.names])
    pred=as.data.frame(predict(model, xtest))
  } else if (method == 'lars'){
    pred=as.data.frame(predict(model, newdata = test))
  }
  # Summary of predicted values
  print ("Summary of predicted values: ")
  print(summary(pred[,1]))
  # Test-set error on the (possibly transformed) modeling scale.
  test.mse = mean((test[,label.names]-pred[,1])^2)
  print (paste(method, subopt, "Test MSE:", test.mse, sep=" "))
  test.rmse = sqrt(test.mse)
  print (paste(method, subopt, "Test RMSE:", test.rmse, sep=" "))
  if(log.pred == TRUE || norm.pred == TRUE){
    # plot the transformed-scale comparison first
    # BUG FIX: inside a function a bare ggplot value is discarded, so this
    # plot was never displayed; print() it explicitly.
    df=data.frame(x=test[,label.names],y=pred[,1])
    print(ggplot(df,aes(x=x,y=y)) +
      geom_point(color='blue',alpha=0.5,shape=20,size=2) +
      geom_abline(slope=1,intercept=0,color='black',size=1) +
      #scale_y_continuous(limits=c(min(df),max(df)))+
      xlab("Actual (Transformed)")+
      ylab("Predicted (Transformed)"))
  }
  # Back-transform actuals/predictions to the original response scale.
  if (log.pred == FALSE && norm.pred == FALSE){
    x = test[,label.names]
    y = pred[,1]
  }
  if (log.pred == TRUE){
    x = 10^test[,label.names]
    y = 10^pred[,1]
  }
  if (norm.pred == TRUE){
    x = predict(transformation, test[,label.names], inverse = TRUE)
    y = predict(transformation, pred[,1], inverse = TRUE)
  }
  # Final plot (returned): predicted vs. actual with identity-line tolerance
  # bands at +/-good (green) and +/-ok (red).
  df=data.frame(x,y)
  ggplot(df,aes(x,y)) +
    geom_point(color='blue',alpha=0.5,shape=20,size=2) +
    geom_abline(slope=c(1+good,1-good,1+ok,1-ok)
                ,intercept=rep(0,4),color=c('dark green','dark green','dark red','dark red'),size=1,alpha=0.8) +
    #scale_y_continuous(limits=c(min(df),max(df)))+
    xlab("Actual")+
    ylab("Predicted")
}
# Build the full regression formula (response ~ all predictors) and the
# intercept-only (grand mean) formula from the training-set column names.
n <- names(data.train)
formula <- as.formula(paste(paste(n[n %in% label.names], collapse = " + ")
," ~", paste(n[!n %in% label.names], collapse = " + ")))
# Null-model formula: response regressed on the intercept alone.
grand.mean.formula = as.formula(paste(paste(n[n %in% label.names], collapse = " + ")," ~ 1"))
print(formula)
## y3 ~ PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10 +
## PC11 + PC12 + PC13 + PC14 + PC15 + PC16 + PC17 + PC18 + PC19 +
## PC20 + PC21 + PC22 + PC23 + PC24 + PC25 + PC26 + PC27 + PC28 +
## PC29 + PC30 + PC31 + PC32 + PC33 + PC34 + PC35 + PC36 + PC37 +
## PC38 + PC39 + PC40 + PC41 + PC42 + PC43 + PC44 + PC45 + PC46 +
## PC47 + PC48 + PC49 + PC50 + PC51 + PC52 + PC53 + PC54 + PC55 +
## PC56 + PC57 + PC58 + PC59 + PC60 + PC61 + PC62 + PC63 + PC64 +
## PC65 + PC66 + PC67 + PC68 + PC69 + PC70 + PC71 + PC72 + PC73 +
## PC74 + PC75 + PC76 + PC77 + PC78 + PC79 + PC80 + PC81 + PC82 +
## PC83 + PC84 + PC85 + PC86 + PC87 + PC88 + PC89 + PC90 + PC91 +
## PC92 + PC93 + PC94 + PC95 + PC96 + PC97 + PC98 + PC99 + PC100 +
## PC101 + PC102 + PC103 + PC104 + PC105 + PC106 + PC107 + PC108 +
## PC109 + PC110 + PC111 + PC112 + PC113 + PC114 + PC115 + PC116 +
## PC117 + PC118 + PC119 + PC120 + PC121 + PC122 + PC123 + PC124 +
## PC125 + PC126 + PC127 + PC128 + PC129 + PC130 + PC131 + PC132 +
## PC133 + PC134 + PC135 + PC136 + PC137 + PC138 + PC139 + PC140 +
## PC141 + PC142 + PC143 + PC144 + PC145 + PC146 + PC147 + PC148 +
## PC149 + PC150 + PC151 + PC152 + PC153 + PC154 + PC155 + PC156 +
## PC157 + PC158 + PC159 + PC160 + PC161 + PC162 + PC163 + PC164
print(grand.mean.formula)
## y3 ~ 1
# Update feature.names because we may have transformed some features
# (every training column that is not a response column counts as a feature).
feature.names = n[!n %in% label.names]
# Full OLS fit with all principal components as predictors.
model.full = lm(formula , data.train)
summary(model.full)
##
## Call:
## lm(formula = formula, data = data.train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18.493 -6.557 -1.973 4.688 65.905
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.255e+02 1.276e-01 983.285 < 2e-16 ***
## PC1 -1.314e-01 1.116e-02 -11.776 < 2e-16 ***
## PC2 -2.565e-01 1.143e-02 -22.445 < 2e-16 ***
## PC3 -1.193e-01 1.128e-02 -10.571 < 2e-16 ***
## PC4 -9.625e-02 1.153e-02 -8.350 < 2e-16 ***
## PC5 5.744e-02 1.194e-02 4.809 1.56e-06 ***
## PC6 -2.652e-02 1.186e-02 -2.236 0.025390 *
## PC7 -5.910e-02 1.227e-02 -4.818 1.49e-06 ***
## PC8 -1.816e-02 1.239e-02 -1.465 0.142893
## PC9 -2.035e-02 1.265e-02 -1.608 0.107803
## PC10 -1.388e-03 1.284e-02 -0.108 0.913924
## PC11 -1.625e-01 1.383e-02 -11.746 < 2e-16 ***
## PC12 -1.316e-01 1.470e-02 -8.952 < 2e-16 ***
## PC13 8.686e-02 1.470e-02 5.910 3.64e-09 ***
## PC14 6.855e-02 1.543e-02 4.442 9.08e-06 ***
## PC15 -1.497e-02 1.566e-02 -0.956 0.339084
## PC16 9.516e-02 1.588e-02 5.992 2.21e-09 ***
## PC17 -4.468e-02 1.669e-02 -2.676 0.007468 **
## PC18 -1.007e-01 1.745e-02 -5.769 8.41e-09 ***
## PC19 2.794e-02 1.753e-02 1.594 0.111000
## PC20 1.203e-01 1.910e-02 6.299 3.24e-10 ***
## PC21 2.989e-02 1.998e-02 1.496 0.134722
## PC22 1.475e-02 3.121e-02 0.473 0.636451
## PC23 4.788e-02 3.850e-02 1.244 0.213634
## PC24 -2.269e-01 4.545e-02 -4.993 6.14e-07 ***
## PC25 6.463e-02 5.098e-02 1.268 0.204962
## PC26 1.219e-01 5.249e-02 2.322 0.020295 *
## PC27 4.439e-02 5.240e-02 0.847 0.396875
## PC28 5.824e-02 5.299e-02 1.099 0.271758
## PC29 1.218e-01 5.778e-02 2.108 0.035081 *
## PC30 -3.525e-02 5.967e-02 -0.591 0.554740
## PC31 -7.716e-02 6.316e-02 -1.222 0.221895
## PC32 -2.310e-01 6.459e-02 -3.576 0.000352 ***
## PC33 1.223e-01 6.630e-02 1.845 0.065058 .
## PC34 3.644e-01 6.979e-02 5.221 1.85e-07 ***
## PC35 3.059e-02 7.475e-02 0.409 0.682367
## PC36 -6.961e-03 7.547e-02 -0.092 0.926515
## PC37 -1.200e-01 7.743e-02 -1.549 0.121382
## PC38 1.275e-01 8.115e-02 1.571 0.116249
## PC39 -4.824e-02 8.398e-02 -0.574 0.565696
## PC40 -6.934e-02 8.371e-02 -0.828 0.407512
## PC41 -9.138e-02 8.519e-02 -1.073 0.283465
## PC42 -9.046e-02 8.497e-02 -1.064 0.287153
## PC43 -6.622e-02 8.662e-02 -0.764 0.444619
## PC44 1.658e-01 8.718e-02 1.902 0.057204 .
## PC45 -4.507e-02 8.689e-02 -0.519 0.603969
## PC46 6.487e-02 8.844e-02 0.733 0.463298
## PC47 -9.889e-02 8.748e-02 -1.130 0.258350
## PC48 4.939e-02 8.878e-02 0.556 0.578051
## PC49 7.835e-02 8.990e-02 0.872 0.383516
## PC50 -3.656e-02 9.198e-02 -0.397 0.691026
## PC51 7.219e-02 9.164e-02 0.788 0.430870
## PC52 -6.456e-02 9.251e-02 -0.698 0.485259
## PC53 6.019e-02 9.150e-02 0.658 0.510685
## PC54 -1.455e-02 9.297e-02 -0.156 0.875656
## PC55 -2.182e-02 9.305e-02 -0.234 0.814636
## PC56 2.786e-02 9.369e-02 0.297 0.766222
## PC57 -1.265e-01 9.484e-02 -1.334 0.182137
## PC58 2.477e-02 9.578e-02 0.259 0.795992
## PC59 2.683e-01 9.516e-02 2.819 0.004829 **
## PC60 -1.652e-02 9.565e-02 -0.173 0.862854
## PC61 1.673e-02 9.549e-02 0.175 0.860965
## PC62 -2.804e-02 9.560e-02 -0.293 0.769258
## PC63 -2.031e-01 9.684e-02 -2.097 0.036055 *
## PC64 -2.418e-01 9.815e-02 -2.464 0.013789 *
## PC65 1.619e-02 9.768e-02 0.166 0.868374
## PC66 -1.283e-01 9.891e-02 -1.297 0.194788
## PC67 2.531e-02 9.828e-02 0.258 0.796778
## PC68 2.032e-01 9.976e-02 2.037 0.041713 *
## PC69 1.429e-01 1.004e-01 1.422 0.155040
## PC70 5.304e-03 9.987e-02 0.053 0.957645
## PC71 2.150e-01 1.009e-01 2.129 0.033276 *
## PC72 -4.956e-02 1.009e-01 -0.491 0.623386
## PC73 1.077e-01 1.022e-01 1.053 0.292180
## PC74 -1.069e-01 1.021e-01 -1.047 0.295103
## PC75 -3.526e-01 1.015e-01 -3.474 0.000517 ***
## PC76 2.490e-02 1.020e-01 0.244 0.807122
## PC77 7.949e-02 1.022e-01 0.778 0.436874
## PC78 1.256e-01 1.027e-01 1.223 0.221274
## PC79 1.773e-01 1.034e-01 1.715 0.086335 .
## PC80 -3.832e-02 1.051e-01 -0.365 0.715320
## PC81 3.223e-01 1.040e-01 3.098 0.001961 **
## PC82 8.857e-02 1.057e-01 0.838 0.401881
## PC83 -2.848e-01 1.056e-01 -2.696 0.007042 **
## PC84 1.996e-01 1.052e-01 1.897 0.057912 .
## PC85 3.202e-01 1.061e-01 3.017 0.002565 **
## PC86 -9.104e-02 1.068e-01 -0.852 0.394044
## PC87 4.636e-01 1.077e-01 4.305 1.70e-05 ***
## PC88 -2.475e-01 1.076e-01 -2.300 0.021474 *
## PC89 -3.094e-01 1.083e-01 -2.858 0.004280 **
## PC90 -2.220e-01 1.083e-01 -2.051 0.040364 *
## PC91 -6.776e-03 1.088e-01 -0.062 0.950352
## PC92 2.924e-02 1.086e-01 0.269 0.787714
## PC93 4.317e-02 1.088e-01 0.397 0.691647
## PC94 -1.430e-01 1.103e-01 -1.296 0.195076
## PC95 5.299e-02 1.096e-01 0.484 0.628692
## PC96 -1.468e-01 1.093e-01 -1.342 0.179595
## PC97 -1.587e-01 1.098e-01 -1.445 0.148459
## PC98 -8.847e-02 1.105e-01 -0.801 0.423243
## PC99 -8.693e-02 1.095e-01 -0.794 0.427371
## PC100 -6.082e-02 1.112e-01 -0.547 0.584425
## PC101 2.001e-02 1.108e-01 0.181 0.856712
## PC102 -2.611e-01 1.110e-01 -2.352 0.018731 *
## PC103 1.297e-01 1.108e-01 1.170 0.241996
## PC104 -1.933e-01 1.109e-01 -1.742 0.081486 .
## PC105 2.262e-01 1.114e-01 2.032 0.042245 *
## PC106 2.189e-01 1.121e-01 1.953 0.050868 .
## PC107 3.391e-02 1.133e-01 0.299 0.764801
## PC108 3.504e-02 1.114e-01 0.315 0.753126
## PC109 1.457e-01 1.127e-01 1.292 0.196302
## PC110 -9.663e-02 1.132e-01 -0.854 0.393359
## PC111 -1.946e-01 1.120e-01 -1.737 0.082430 .
## PC112 -1.582e-02 1.125e-01 -0.141 0.888240
## PC113 1.620e-01 1.131e-01 1.432 0.152074
## PC114 -1.121e-01 1.135e-01 -0.987 0.323614
## PC115 -4.110e-01 1.132e-01 -3.632 0.000284 ***
## PC116 -1.165e-01 1.137e-01 -1.025 0.305537
## PC117 -9.854e-02 1.136e-01 -0.867 0.385805
## PC118 7.659e-02 1.132e-01 0.676 0.498900
## PC119 -1.523e-01 1.142e-01 -1.333 0.182528
## PC120 -8.010e-03 1.144e-01 -0.070 0.944164
## PC121 -8.655e-02 1.152e-01 -0.751 0.452468
## PC122 2.160e-01 1.139e-01 1.896 0.058041 .
## PC123 -2.611e-01 1.156e-01 -2.257 0.024021 *
## PC124 8.865e-02 1.148e-01 0.772 0.440003
## PC125 9.813e-02 1.161e-01 0.845 0.398033
## PC126 4.249e-02 1.144e-01 0.371 0.710325
## PC127 8.459e-02 1.151e-01 0.735 0.462283
## PC128 -2.500e-01 1.154e-01 -2.167 0.030274 *
## PC129 1.186e-01 1.157e-01 1.025 0.305444
## PC130 1.379e-01 1.148e-01 1.201 0.229755
## PC131 -3.893e-01 1.159e-01 -3.358 0.000789 ***
## PC132 9.554e-02 1.166e-01 0.819 0.412659
## PC133 -1.364e-01 1.173e-01 -1.163 0.244820
## PC134 2.473e-01 1.162e-01 2.129 0.033298 *
## PC135 1.493e-01 1.166e-01 1.280 0.200739
## PC136 1.085e-01 1.166e-01 0.931 0.352145
## PC137 -1.633e-01 1.167e-01 -1.400 0.161660
## PC138 1.711e-01 1.176e-01 1.455 0.145689
## PC139 -3.801e-01 1.175e-01 -3.234 0.001226 **
## PC140 -1.841e-01 1.172e-01 -1.571 0.116139
## PC141 -2.600e-02 1.179e-01 -0.220 0.825511
## PC142 -3.962e-02 1.172e-01 -0.338 0.735387
## PC143 1.599e-01 1.178e-01 1.357 0.174927
## PC144 2.729e-01 1.186e-01 2.300 0.021467 *
## PC145 1.552e-01 1.191e-01 1.303 0.192599
## PC146 3.196e-01 1.179e-01 2.711 0.006731 **
## PC147 -1.105e-01 1.184e-01 -0.933 0.350625
## PC148 -1.157e-01 1.182e-01 -0.978 0.327878
## PC149 3.633e-04 1.191e-01 0.003 0.997566
## PC150 8.536e-02 1.194e-01 0.715 0.474623
## PC151 2.150e-01 1.193e-01 1.803 0.071404 .
## PC152 -1.065e-01 1.193e-01 -0.893 0.372066
## PC153 2.921e-01 1.195e-01 2.444 0.014556 *
## PC154 -2.298e-01 1.189e-01 -1.933 0.053295 .
## PC155 2.115e-01 1.200e-01 1.762 0.078191 .
## PC156 3.522e-01 1.204e-01 2.925 0.003464 **
## PC157 -2.725e-02 1.194e-01 -0.228 0.819438
## PC158 -1.147e-01 1.187e-01 -0.967 0.333710
## PC159 6.387e-01 1.202e-01 5.315 1.11e-07 ***
## PC160 -1.245e-01 1.198e-01 -1.040 0.298465
## PC161 7.256e-02 1.206e-01 0.602 0.547385
## PC162 -2.524e-01 1.213e-01 -2.081 0.037519 *
## PC163 3.066e-01 1.206e-01 2.543 0.011032 *
## PC164 3.248e-02 1.206e-01 0.269 0.787699
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.508 on 5419 degrees of freedom
## Multiple R-squared: 0.2435, Adjusted R-squared: 0.2206
## F-statistic: 10.64 on 164 and 5419 DF, p-value: < 2.2e-16
# Cook's distance diagnostics for the full model; plot.diagnostics (defined
# earlier in this script) returns the per-observation Cook's D values.
cd.full = plot.diagnostics(model=model.full, train=data.train)
## [1] "Number of data points that have Cook's D > 4/n: 266"
## [1] "Number of data points that have Cook's D > 1: 0"
# Row names of influential points under the conventional 4/n cutoff.
high.cd = names(cd.full[cd.full > 4/nrow(data.train)])
#save dataset with high.cd flagged
# NOTE(review): this reassigns `t` (masking base::t); the same `t` is later
# passed as `transformation` to test.model. Harmless while norm.pred is
# FALSE, but worth verifying.
t = data.train %>%
rownames_to_column() %>%
mutate(high.cd = ifelse(rowname %in% high.cd,1,0))
#write.csv(t,file='data_high_cd_flag.csv',row.names = F)
###
# Refit the full model with the high-influence rows removed.
data.train2 = data.train[!(rownames(data.train)) %in% high.cd,]
model.full2 = lm(formula , data.train2)
summary(model.full2)
##
## Call:
## lm(formula = formula, data = data.train2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -16.617 -5.585 -1.286 4.731 25.321
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.243e+02 1.033e-01 1203.068 < 2e-16 ***
## PC1 -1.427e-01 9.276e-03 -15.387 < 2e-16 ***
## PC2 -2.507e-01 9.287e-03 -26.992 < 2e-16 ***
## PC3 -1.219e-01 9.213e-03 -13.229 < 2e-16 ***
## PC4 -1.027e-01 9.345e-03 -10.990 < 2e-16 ***
## PC5 5.661e-02 9.727e-03 5.821 6.22e-09 ***
## PC6 -1.624e-02 9.627e-03 -1.686 0.091776 .
## PC7 -6.191e-02 9.935e-03 -6.232 4.97e-10 ***
## PC8 -1.627e-02 1.008e-02 -1.615 0.106398
## PC9 -5.185e-03 1.027e-02 -0.505 0.613647
## PC10 6.245e-03 1.044e-02 0.598 0.549811
## PC11 -1.844e-01 1.121e-02 -16.441 < 2e-16 ***
## PC12 -1.320e-01 1.185e-02 -11.131 < 2e-16 ***
## PC13 8.002e-02 1.189e-02 6.729 1.89e-11 ***
## PC14 5.951e-02 1.245e-02 4.779 1.81e-06 ***
## PC15 -2.534e-02 1.271e-02 -1.994 0.046212 *
## PC16 8.283e-02 1.284e-02 6.452 1.20e-10 ***
## PC17 -4.853e-02 1.352e-02 -3.589 0.000335 ***
## PC18 -9.870e-02 1.410e-02 -7.001 2.86e-12 ***
## PC19 3.513e-02 1.420e-02 2.474 0.013376 *
## PC20 1.243e-01 1.547e-02 8.033 1.17e-15 ***
## PC21 2.894e-02 1.616e-02 1.790 0.073475 .
## PC22 3.944e-02 2.521e-02 1.564 0.117830
## PC23 5.557e-02 3.188e-02 1.743 0.081363 .
## PC24 -2.297e-01 3.709e-02 -6.192 6.41e-10 ***
## PC25 5.763e-02 4.149e-02 1.389 0.164886
## PC26 4.323e-02 4.277e-02 1.011 0.312257
## PC27 -9.779e-03 4.282e-02 -0.228 0.819383
## PC28 3.267e-02 4.313e-02 0.758 0.448783
## PC29 1.379e-01 4.686e-02 2.943 0.003266 **
## PC30 -3.179e-02 4.869e-02 -0.653 0.513892
## PC31 -7.562e-02 5.116e-02 -1.478 0.139440
## PC32 -2.310e-01 5.227e-02 -4.419 1.01e-05 ***
## PC33 3.962e-02 5.404e-02 0.733 0.463572
## PC34 3.733e-01 5.660e-02 6.595 4.68e-11 ***
## PC35 1.081e-01 6.148e-02 1.757 0.078900 .
## PC36 -2.624e-02 6.164e-02 -0.426 0.670339
## PC37 -1.410e-01 6.314e-02 -2.233 0.025606 *
## PC38 1.336e-01 6.588e-02 2.028 0.042614 *
## PC39 -1.600e-02 7.275e-02 -0.220 0.825992
## PC40 -1.251e-01 6.883e-02 -1.817 0.069290 .
## PC41 -1.049e-01 7.005e-02 -1.497 0.134339
## PC42 8.690e-03 7.027e-02 0.124 0.901589
## PC43 8.878e-03 7.149e-02 0.124 0.901177
## PC44 7.788e-02 7.300e-02 1.067 0.286134
## PC45 -3.559e-02 7.150e-02 -0.498 0.618722
## PC46 1.822e-02 7.256e-02 0.251 0.801775
## PC47 -7.964e-02 7.180e-02 -1.109 0.267435
## PC48 1.001e-01 7.250e-02 1.381 0.167381
## PC49 1.218e-01 7.363e-02 1.655 0.098087 .
## PC50 -8.416e-02 7.591e-02 -1.109 0.267614
## PC51 8.061e-02 7.530e-02 1.071 0.284441
## PC52 -1.026e-01 7.631e-02 -1.345 0.178727
## PC53 1.451e-01 7.498e-02 1.935 0.053002 .
## PC54 -8.312e-02 7.707e-02 -1.079 0.280858
## PC55 -1.190e-01 7.678e-02 -1.549 0.121372
## PC56 2.022e-02 7.803e-02 0.259 0.795500
## PC57 -1.742e-01 7.777e-02 -2.240 0.025156 *
## PC58 -1.088e-01 7.889e-02 -1.380 0.167761
## PC59 3.036e-01 7.824e-02 3.881 0.000106 ***
## PC60 -5.719e-02 7.846e-02 -0.729 0.466083
## PC61 -1.027e-02 7.769e-02 -0.132 0.894829
## PC62 8.936e-03 7.831e-02 0.114 0.909145
## PC63 -8.873e-02 7.939e-02 -1.118 0.263757
## PC64 -2.063e-01 8.016e-02 -2.574 0.010088 *
## PC65 -2.719e-02 7.982e-02 -0.341 0.733400
## PC66 -4.315e-02 8.177e-02 -0.528 0.597741
## PC67 1.446e-02 8.077e-02 0.179 0.857950
## PC68 2.367e-01 8.186e-02 2.892 0.003849 **
## PC69 1.190e-01 8.260e-02 1.440 0.149864
## PC70 6.598e-02 8.106e-02 0.814 0.415709
## PC71 4.770e-02 8.235e-02 0.579 0.562478
## PC72 -2.955e-02 8.231e-02 -0.359 0.719610
## PC73 1.316e-01 8.325e-02 1.580 0.114101
## PC74 2.127e-02 8.338e-02 0.255 0.798661
## PC75 -2.239e-01 8.265e-02 -2.709 0.006774 **
## PC76 -5.587e-03 8.300e-02 -0.067 0.946338
## PC77 6.900e-02 8.281e-02 0.833 0.404739
## PC78 -1.845e-02 8.346e-02 -0.221 0.825022
## PC79 1.765e-01 8.440e-02 2.091 0.036540 *
## PC80 -3.833e-02 8.501e-02 -0.451 0.652118
## PC81 3.311e-01 8.468e-02 3.910 9.36e-05 ***
## PC82 3.385e-02 8.570e-02 0.395 0.692894
## PC83 -2.719e-01 8.632e-02 -3.150 0.001645 **
## PC84 1.210e-01 8.582e-02 1.410 0.158592
## PC85 3.931e-01 8.688e-02 4.524 6.20e-06 ***
## PC86 -1.031e-02 8.688e-02 -0.119 0.905580
## PC87 3.282e-01 8.748e-02 3.752 0.000177 ***
## PC88 -2.102e-01 8.766e-02 -2.397 0.016545 *
## PC89 -2.681e-01 8.811e-02 -3.043 0.002354 **
## PC90 -1.820e-01 8.810e-02 -2.065 0.038926 *
## PC91 -9.230e-02 8.810e-02 -1.048 0.294826
## PC92 1.016e-01 8.791e-02 1.155 0.247964
## PC93 -6.293e-02 8.881e-02 -0.709 0.478620
## PC94 -8.048e-02 8.943e-02 -0.900 0.368205
## PC95 6.416e-02 8.931e-02 0.718 0.472528
## PC96 -1.337e-01 8.852e-02 -1.511 0.130944
## PC97 -1.015e-01 8.914e-02 -1.139 0.254743
## PC98 -1.896e-01 8.970e-02 -2.114 0.034567 *
## PC99 -5.066e-02 8.915e-02 -0.568 0.569859
## PC100 -1.083e-01 8.974e-02 -1.207 0.227533
## PC101 -7.877e-02 9.004e-02 -0.875 0.381706
## PC102 -1.599e-01 9.000e-02 -1.776 0.075732 .
## PC103 1.052e-01 9.011e-02 1.168 0.242836
## PC104 -1.582e-01 8.975e-02 -1.763 0.078029 .
## PC105 2.586e-01 9.046e-02 2.859 0.004270 **
## PC106 1.340e-01 9.075e-02 1.477 0.139838
## PC107 7.566e-03 9.254e-02 0.082 0.934843
## PC108 -7.328e-04 9.029e-02 -0.008 0.993525
## PC109 1.244e-01 9.130e-02 1.363 0.172991
## PC110 -1.104e-01 9.174e-02 -1.204 0.228712
## PC111 -2.983e-01 9.098e-02 -3.279 0.001050 **
## PC112 -4.298e-02 9.125e-02 -0.471 0.637668
## PC113 1.089e-01 9.160e-02 1.189 0.234389
## PC114 -1.158e-01 9.198e-02 -1.259 0.208024
## PC115 -3.993e-01 9.190e-02 -4.344 1.42e-05 ***
## PC116 -6.362e-02 9.217e-02 -0.690 0.490100
## PC117 -6.312e-02 9.198e-02 -0.686 0.492617
## PC118 7.939e-03 9.188e-02 0.086 0.931146
## PC119 -1.345e-01 9.257e-02 -1.453 0.146266
## PC120 6.810e-02 9.303e-02 0.732 0.464219
## PC121 -1.250e-01 9.329e-02 -1.339 0.180505
## PC122 1.175e-01 9.218e-02 1.275 0.202461
## PC123 -1.792e-01 9.363e-02 -1.914 0.055707 .
## PC124 -2.287e-02 9.334e-02 -0.245 0.806455
## PC125 2.710e-01 9.407e-02 2.881 0.003981 **
## PC126 7.243e-02 9.246e-02 0.783 0.433423
## PC127 -5.848e-03 9.303e-02 -0.063 0.949879
## PC128 -2.365e-01 9.340e-02 -2.532 0.011366 *
## PC129 9.532e-02 9.428e-02 1.011 0.312077
## PC130 1.383e-01 9.291e-02 1.488 0.136692
## PC131 -2.668e-01 9.387e-02 -2.843 0.004489 **
## PC132 7.647e-02 9.440e-02 0.810 0.417930
## PC133 -9.252e-02 9.554e-02 -0.968 0.332923
## PC134 2.200e-01 9.426e-02 2.334 0.019631 *
## PC135 2.773e-02 9.424e-02 0.294 0.768557
## PC136 1.697e-01 9.470e-02 1.792 0.073135 .
## PC137 -2.052e-01 9.421e-02 -2.179 0.029400 *
## PC138 1.159e-01 9.524e-02 1.217 0.223633
## PC139 -2.624e-01 9.545e-02 -2.749 0.005991 **
## PC140 -2.629e-01 9.487e-02 -2.771 0.005613 **
## PC141 7.418e-02 9.565e-02 0.776 0.438038
## PC142 5.092e-02 9.493e-02 0.536 0.591685
## PC143 1.453e-01 9.540e-02 1.523 0.127856
## PC144 1.509e-01 9.625e-02 1.567 0.117113
## PC145 2.393e-01 9.691e-02 2.469 0.013583 *
## PC146 3.795e-01 9.523e-02 3.985 6.85e-05 ***
## PC147 -1.223e-01 9.651e-02 -1.268 0.204983
## PC148 -1.202e-01 9.552e-02 -1.259 0.208223
## PC149 -4.055e-02 9.629e-02 -0.421 0.673711
## PC150 1.357e-01 9.685e-02 1.401 0.161313
## PC151 2.621e-01 9.651e-02 2.716 0.006626 **
## PC152 -9.567e-02 9.674e-02 -0.989 0.322711
## PC153 1.776e-01 9.644e-02 1.841 0.065639 .
## PC154 -1.860e-01 9.659e-02 -1.926 0.054147 .
## PC155 1.635e-01 9.692e-02 1.687 0.091627 .
## PC156 2.186e-01 9.762e-02 2.239 0.025187 *
## PC157 -2.338e-02 9.697e-02 -0.241 0.809455
## PC158 -7.160e-02 9.630e-02 -0.743 0.457235
## PC159 4.601e-01 9.734e-02 4.727 2.34e-06 ***
## PC160 -1.036e-01 9.757e-02 -1.062 0.288341
## PC161 8.399e-03 9.768e-02 0.086 0.931481
## PC162 -2.937e-01 9.822e-02 -2.990 0.002801 **
## PC163 2.100e-01 9.817e-02 2.139 0.032443 *
## PC164 5.438e-02 9.772e-02 0.556 0.577905
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.498 on 5153 degrees of freedom
## Multiple R-squared: 0.3365, Adjusted R-squared: 0.3153
## F-statistic: 15.93 on 164 and 5153 DF, p-value: < 2.2e-16
# Diagnostics for the refit with influential rows removed.
cd.full2 = plot.diagnostics(model.full2, data.train2)
## [1] "Number of data points that have Cook's D > 4/n: 249"
## [1] "Number of data points that have Cook's D > 1: 0"
# much more normal residuals than before.
# Checking to see if distributions are different and if so which variables
# High Leverage Plot
# Boxplot of the target, split by whether the row was flagged as
# high-influence (Cook's D > 4/n) in the original fit.
plotData = data.train %>%
rownames_to_column() %>%
mutate(type=ifelse(rowname %in% high.cd,'High','Normal')) %>%
dplyr::select(type,target=one_of(label.names))
ggplot(data=plotData, aes(x=type,y=target)) +
geom_boxplot(fill='light blue',outlier.shape=NA) +
scale_y_continuous(name="Target Variable Values",label=scales::comma_format(accuracy=.1)) +
theme_light() +
ggtitle('Distribution of High Leverage Points and Normal Points')
# 2 sample t-tests
# For every feature, a two-sample t-test comparing flagged vs normal rows;
# keep only the features whose group means differ at p < 0.05.
plotData = data.train %>%
rownames_to_column() %>%
mutate(type=ifelse(rowname %in% high.cd,'High','Normal')) %>%
dplyr::select(type,one_of(feature.names))
comp.test = lapply(dplyr::select(plotData, one_of(feature.names))
, function(x) t.test(x ~ plotData$type, var.equal = TRUE))
sig.comp = list.filter(comp.test, p.value < 0.05)
sapply(sig.comp, function(x) x[['p.value']])
## PC1 PC6 PC11 PC23 PC24 PC26 PC27 PC33 PC39
## 7.486933e-08 1.932209e-02 7.663853e-03 4.398961e-06 4.811195e-02 6.985612e-05 1.410329e-02 4.718986e-02 8.616148e-03
## PC41 PC42 PC44 PC45 PC58 PC75 PC78 PC87 PC124
## 4.056784e-02 1.921638e-03 1.483273e-04 6.028062e-03 9.238373e-03 1.215051e-02 1.669496e-02 1.959496e-02 4.710428e-02
## PC125 PC131 PC159 PC161
## 3.851990e-02 8.989203e-03 1.361394e-02 4.951842e-02
# Boxplots of just the significantly-different features, flagged vs normal.
mm = melt(plotData, id=c('type')) %>% filter(variable %in% names(sig.comp))
ggplot(mm,aes(x=type, y=value)) +
geom_boxplot()+
facet_wrap(~variable, ncol=5, scales = 'free_y') +
scale_y_continuous(name="values",label=scales::comma_format(accuracy=.1)) +
ggtitle('Distribution of High Leverage Points and Normal Points')
# Distribution (box) Plots
# Same comparison for every feature, not just the significant ones.
mm = melt(plotData, id=c('type'))
ggplot(mm,aes(x=type, y=value)) +
geom_boxplot()+
facet_wrap(~variable, ncol=8, scales = 'free_y') +
scale_y_continuous(name="values",label=scales::comma_format(accuracy=.1)) +
ggtitle('Distribution of High Leverage Points and Normal Points')
# Intercept-only baseline model (grand mean of the response).
model.null = lm(grand.mean.formula, data.train)
summary(model.null)
##
## Call:
## lm(formula = grand.mean.formula, data = data.train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -28.533 -7.142 -1.395 5.644 68.236
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 125.4903 0.1441 870.8 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.77 on 5583 degrees of freedom
References — Basic stepwise regression in R: http://www.stat.columbia.edu/~martin/W2024/R10.pdf ; Cross-validation and other model-selection metrics: http://www.sthda.com/english/articles/37-model-selection-essentials-in-r/154-stepwise-regression-essentials-in-r/
# Forward stepwise selection via caret (leapForward), gated by params.
if (algo.forward.caret == TRUE){
set.seed(1)
returned = train.caret.glmselect(formula = formula
, data = data.train
, method = "leapForward"
, feature.names = feature.names)
model.forward = returned$model
id = returned$id
}
# Evaluate the selected forward model on the held-out test set.
# NOTE(review): `transformation = t` passes the flagged data frame created
# during the Cook's D analysis (or base::t), not a bestNormalize object;
# unused while norm.pred is FALSE, but verify.
if (algo.forward.caret == TRUE){
test.model(model=model.forward, test=data.test
,method = 'leapForward',subopt = NULL
,formula = formula, feature.names = feature.names, label.names = label.names
,id = id
,draw.limits = TRUE, transformation = t)
}
# Backward stepwise selection via caret (leapBackward), gated by params.
if (algo.backward.caret == TRUE){
set.seed(1)
returned = train.caret.glmselect(formula = formula
,data = data.train
,method = "leapBackward"
,feature.names = feature.names)
model.backward = returned$model
id = returned$id
}
# Evaluate the selected backward model on the held-out test set.
# NOTE(review): `transformation = t` is the flagged data frame from the
# Cook's D analysis, not a transformation; unused while norm.pred is FALSE.
if (algo.backward.caret == TRUE){
test.model(model.backward, data.test
,method = 'leapBackward',subopt = NULL
,formula = formula, feature.names = feature.names, label.names = label.names
,id = id
,draw.limits = TRUE, transformation = t)
}
# Bidirectional stepwise selection via caret (leapSeq), gated by params.
if (algo.stepwise.caret == TRUE){
set.seed(1)
returned = train.caret.glmselect(formula = formula
,data = data.train
,method = "leapSeq"
,feature.names = feature.names)
model.stepwise = returned$model
id = returned$id
}
# Evaluate the selected stepwise model on the held-out test set.
# NOTE(review): `transformation = t` is the flagged data frame from the
# Cook's D analysis, not a transformation; unused while norm.pred is FALSE.
if (algo.stepwise.caret == TRUE){
test.model(model.stepwise, data.test
,method = 'leapSeq',subopt = NULL
,formula = formula, feature.names = feature.names, label.names = label.names
,id = id
,draw.limits = TRUE, transformation = t)
}
# LASSO via caret/glmnet (alpha = 1) over a log-spaced lambda grid,
# gated by params.
if (algo.LASSO.caret == TRUE){
set.seed(1)
tune.grid= expand.grid(alpha = 1,lambda = 10^seq(from=-4,to=-2,length=100))
returned = train.caret.glmselect(formula = formula
,data = data.train
,method = "glmnet"
,subopt = 'LASSO'
,tune.grid = tune.grid
,feature.names = feature.names)
model.LASSO.caret = returned$model
}
# Evaluate the LASSO model on the held-out test set.
# NOTE(review): `transformation = t` is the flagged data frame from the
# Cook's D analysis, not a transformation; unused while norm.pred is FALSE.
if (algo.LASSO.caret == TRUE){
test.model(model.LASSO.caret, data.test
,method = 'glmnet',subopt = "LASSO"
,formula = formula, feature.names = feature.names, label.names = label.names
,draw.limits = TRUE, transformation = t)
}
# LARS model via caret, gated by params.
# BUG FIX: subopt was passed as the character string 'NULL' rather than the
# NULL value. Every other call site (including the test.model call below)
# passes a real NULL, and NULL-checks like is.null() can never match the
# string "NULL".
if (algo.LARS.caret == TRUE){
  set.seed(1)
  returned = train.caret.glmselect(formula = formula
                                   ,data = data.train
                                   ,method = "lars"
                                   ,subopt = NULL
                                   ,feature.names = feature.names)
  model.LARS.caret = returned$model
}
# Evaluate the LARS model on the held-out test set.
# NOTE(review): `transformation = t` is the flagged data frame from the
# Cook's D analysis, not a transformation; unused while norm.pred is FALSE.
if (algo.LARS.caret == TRUE){
  test.model(model.LARS.caret, data.test
             ,method = 'lars',subopt = NULL
             ,formula = formula, feature.names = feature.names, label.names = label.names
             ,draw.limits = TRUE, transformation = t)
}
sessionInfo()
## R version 3.5.1 (2018-07-02)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 17134)
##
## Matrix products: default
##
## locale:
## [1] LC_COLLATE=English_United States.1252 LC_CTYPE=English_United States.1252 LC_MONETARY=English_United States.1252
## [4] LC_NUMERIC=C LC_TIME=English_United States.1252
##
## attached base packages:
## [1] parallel stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] bindrcpp_0.2.2 knitr_1.20 htmltools_0.3.6 reshape2_1.4.3
## [5] lars_1.2 doParallel_1.0.14 iterators_1.0.10 caret_6.0-81
## [9] leaps_3.0 ggforce_0.1.3 rlist_0.4.6.1 car_3.0-2
## [13] carData_3.0-2 bestNormalize_1.3.0 scales_1.0.0 onewaytests_2.0
## [17] caTools_1.17.1.1 mosaic_1.5.0 mosaicData_0.17.0 ggformula_0.9.1
## [21] ggstance_0.3.1 lattice_0.20-35 DT_0.5 ggiraph_0.6.0
## [25] investr_1.4.0 glmnet_2.0-16 foreach_1.4.4 Matrix_1.2-14
## [29] MASS_7.3-50 PerformanceAnalytics_1.5.2 xts_0.11-2 zoo_1.8-4
## [33] forcats_0.3.0 stringr_1.3.1 dplyr_0.7.8 purrr_0.2.5
## [37] readr_1.3.1 tidyr_0.8.2 tibble_1.4.2 ggplot2_3.1.0
## [41] tidyverse_1.2.1 usdm_1.1-18 raster_2.8-4 sp_1.3-1
## [45] pacman_0.5.0
##
## loaded via a namespace (and not attached):
## [1] readxl_1.2.0 backports_1.1.3 plyr_1.8.4 lazyeval_0.2.1 splines_3.5.1 mycor_0.1.1
## [7] crosstalk_1.0.0 leaflet_2.0.2 digest_0.6.18 magrittr_1.5 mosaicCore_0.6.0 openxlsx_4.1.0
## [13] recipes_0.1.4 modelr_0.1.2 gower_0.1.2 colorspace_1.3-2 rvest_0.3.2 ggrepel_0.8.0
## [19] haven_2.0.0 crayon_1.3.4 jsonlite_1.5 bindr_0.1.1 survival_2.42-3 glue_1.3.0
## [25] registry_0.5 gtable_0.2.0 ppcor_1.1 ipred_0.9-8 abind_1.4-5 rngtools_1.3.1
## [31] bibtex_0.4.2 Rcpp_1.0.0 xtable_1.8-3 units_0.6-2 foreign_0.8-70 stats4_3.5.1
## [37] lava_1.6.4 prodlim_2018.04.18 htmlwidgets_1.3 httr_1.4.0 RColorBrewer_1.1-2 pkgconfig_2.0.2
## [43] farver_1.1.0 nnet_7.3-12 labeling_0.3 tidyselect_0.2.5 rlang_0.3.1 later_0.7.5
## [49] munsell_0.5.0 cellranger_1.1.0 tools_3.5.1 cli_1.0.1 generics_0.0.2 moments_0.14
## [55] sjlabelled_1.0.17 broom_0.5.1 evaluate_0.12 ggdendro_0.1-20 yaml_2.2.0 ModelMetrics_1.2.2
## [61] zip_2.0.1 nlme_3.1-137 doRNG_1.7.1 mime_0.6 xml2_1.2.0 compiler_3.5.1
## [67] rstudioapi_0.8 curl_3.2 tweenr_1.0.1 stringi_1.2.4 highr_0.7 gdtools_0.1.7
## [73] pillar_1.3.1 data.table_1.11.8 bitops_1.0-6 insight_0.1.2 httpuv_1.4.5 R6_2.3.0
## [79] promises_1.0.1 gridExtra_2.3 rio_0.5.16 codetools_0.2-15 assertthat_0.2.0 pkgmaker_0.27
## [85] withr_2.1.2 nortest_1.0-4 mgcv_1.8-24 hms_0.4.2 quadprog_1.5-5 grid_3.5.1
## [91] rpart_4.1-13 timeDate_3043.102 class_7.3-14 rmarkdown_1.11 shiny_1.2.0 lubridate_1.7.4